load('/data/nycdoe/clean_data/school_percentile_for_thoa.Rdata')
summary(school_percentile)
## dbn percentile rank
## Length:2036 Min. : 0.09934 Min. : 1.0
## Class :character 1st Qu.: 27.32366 1st Qu.: 509.8
## Mode :character Median : 48.87417 Median :1018.5
## Mean : 49.00269 Mean :1018.5
## 3rd Qu.: 70.77172 3rd Qu.:1527.2
## Max. :100.00000 Max. :2036.0
# bigger value for `rank` means worse school (percentile is lower). Highest percentile - 100 - is ranked number 1.
school_percentile %>%
filter(rank == 1)
## # A tibble: 1 x 3
## dbn percentile rank
## <chr> <dbl> <dbl>
## 1 75M444 100 1
load('/data/nycdoe/clean_data/avg_all_long.Rdata')
Note: we only care about students who put the school on their First choice in First round.
load("/data/nycdoe/clean_data/eightgrade_apps_long_w_dbn.Rdata")
summary(eightgrade_apps_long_w_dbn)
## year student_id_scram feeder_dbn grade_level
## Min. :2005 Length:9421536 Length:9421536 Min. :8
## 1st Qu.:2007 Class :character Class :character 1st Qu.:8
## Median :2010 Mode :character Mode :character Median :8
## Mean :2010 Mean :8
## 3rd Qu.:2013 3rd Qu.:8
## Max. :2015 Max. :8
## feeder_dbn(june) choice program_applied
## Length:9421536 Length:9421536 Length:9421536
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## dbn pgname school_name
## Length:9421536 Length:9421536 Length:9421536
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## interest method
## Length:9421536 Length:9421536
## Class :character Class :character
## Mode :character Mode :character
##
##
##
head(eightgrade_apps_long_w_dbn)
## # A tibble: 6 x 12
## year student_id_scram feeder_dbn grade_level `feeder_dbn(june)`
## <dbl> <chr> <chr> <int> <chr>
## 1 2005 489302451 10X206 8 10X206
## 2 2005 626402037 10X206 8 10X206
## 3 2005 843402069 10X206 8 10X206
## 4 2005 967502441 10X206 8 10X206
## 5 2005 330502342 10X206 8 10X206
## 6 2005 799502792 10X206 8 10X206
## # ... with 7 more variables: choice <chr>, program_applied <chr>,
## # dbn <chr>, pgname <chr>, school_name <chr>, interest <chr>,
## # method <chr>
# so this table has from r1programcode1 up to r1programcode12 only
tail(eightgrade_apps_long_w_dbn$choice)
## [1] "r1programcode12" "r1programcode12" "r1programcode12" "r1programcode12"
## [5] "r1programcode12" "r1programcode12"
# 104,378 rows (1.108%) missing school names. These come from 31 dbns (and programs) with "Missing" school names.
eightgrade_apps_long_w_dbn %>%
filter(school_name == "Missing") %>% distinct(dbn, program_applied)
## # A tibble: 31 x 2
## program_applied dbn
## <chr> <chr>
## 1 X74A 12X690
## 2 X67J 07X655
## 3 X40A 11X541
## 4 M56A 02M620
## 5 X82X 12X262
## 6 X60O 10X660
## 7 X54A 09X414
## 8 M24F 02M460
## 9 M72A 05M469
## 10 X12Z 11X415
## # ... with 21 more rows
# join the student's performance to the application data
rm(perschool_appquality)
## Warning in rm(perschool_appquality): object 'perschool_appquality' not
## found
perschool_appquality <-
eightgrade_apps_long_w_dbn %>%
filter(choice == "r1programcode1") %>% #because we only care about what students put in r1r1
group_by(year, dbn) %>%
select(year, dbn, school_name, program_applied, pgname, student_id_scram) %>%
inner_join(avg_all_percentile, by=c("student_id_scram", "year")) %>%
arrange(year, dbn, performance)
# thanks to `inner_join`, there is no row with missing student's performance
summary(perschool_appquality)
## year dbn school_name program_applied
## Min. :2005 Length:760199 Length:760199 Length:760199
## 1st Qu.:2007 Class :character Class :character Class :character
## Median :2010 Mode :character Mode :character Mode :character
## Mean :2010
## 3rd Qu.:2013
## Max. :2015
## pgname student_id_scram grade_level performance
## Length:760199 Length:760199 Min. :8 Min. : 0.00
## Class :character Class :character 1st Qu.:8 1st Qu.:26.06
## Mode :character Mode :character Median :8 Median :45.20
## Mean :8 Mean :45.66
## 3rd Qu.:8 3rd Qu.:64.76
## Max. :8 Max. :99.92
perschool_appquality_medianPerf <-
perschool_appquality %>%
group_by(year, dbn) %>%
summarize(median_stud_perf = median(performance),
mean_stud_perf = mean(performance),
top5_stud_perf = quantile(performance, 0.95))
summary(perschool_appquality_medianPerf)
## year dbn median_stud_perf mean_stud_perf
## Min. :2005 Length:4188 Min. : 1.977 Min. : 2.162
## 1st Qu.:2008 Class :character 1st Qu.:26.780 1st Qu.:29.441
## Median :2010 Mode :character Median :33.508 Median :35.222
## Mean :2010 Mean :35.170 Mean :36.685
## 3rd Qu.:2013 3rd Qu.:41.522 3rd Qu.:42.078
## Max. :2015 Max. :82.871 Max. :82.871
## top5_stud_perf
## Min. : 2.162
## 1st Qu.:61.963
## Median :70.003
## Mean :69.498
## 3rd Qu.:77.909
## Max. :96.923
Most schools receive applicants with performance in 20 - 40 (out of 100) range.
# plot it!
yr = 2015
(plot1 <-
perschool_appquality_medianPerf %>%
filter(year == yr) %>%
ggplot() +
geom_histogram(aes(x = median_stud_perf), bins = 80) +
labs(title = paste("Distribution of Applicant Quality among Schools in", yr),
x = "Median applicant's performance",
y = "Number of schools"))
ggsave(filename = "app_quality_2015.pdf", plot = plot1, width = 8, height = 6)
# join in percentile of the schools themselves
rm(temp)
## Warning in rm(temp): object 'temp' not found
temp <- perschool_appquality_medianPerf %>%
inner_join(school_percentile, by="dbn") %>%
rename("percentile_school" = percentile)
Across all years, there are 2,792 schools with info on applicants’ performance.
summary(temp)
## year dbn median_stud_perf mean_stud_perf
## Min. :2005 Length:4132 Min. : 1.977 Min. : 2.162
## 1st Qu.:2008 Class :character 1st Qu.:26.765 1st Qu.:29.429
## Median :2010 Mode :character Median :33.529 Median :35.220
## Mean :2010 Mean :35.198 Mean :36.701
## 3rd Qu.:2013 3rd Qu.:41.623 3rd Qu.:42.190
## Max. :2015 Max. :82.536 Max. :78.937
## top5_stud_perf percentile_school rank
## Min. : 2.162 Min. : 1.258 Min. : 7.0
## 1st Qu.:61.894 1st Qu.:32.900 1st Qu.: 556.0
## Median :69.932 Median :49.042 Median :1015.0
## Mean :69.474 Mean :50.904 Mean : 981.9
## 3rd Qu.:77.909 3rd Qu.:68.228 3rd Qu.:1404.0
## Max. :96.923 Max. :98.871 Max. :2019.0
The number of schools applied to steadily increases over years. From 300 in 2015 to more than 400 in 2015. (but note, the actual number could be higher, because these are just schools with available applicant’s performance.)
(plot0 <-
temp %>%
count(year) %>%
ggplot(aes(x = as.factor(year), y = n)) +
geom_point() +
geom_text(aes(label = n), vjust = -0.7) +
labs(title = "Number of High Schools Applied to over Years", x = "Year", y = "Number of High Schools",
subtitle = "Note: the actual number could be higher; these are just schools w/ available applicants' performance"))
ggsave(filename = "count_schools_applied_over_years.pdf", plot = plot0, width = 8, height = 6)
Better schools generally receive better applicants
# plot student's performance against school's percentile (note: there are 418 points in this plot)
(plot2 <-
temp %>%
filter(year == yr) %>%
ggplot(aes(y = median_stud_perf, x = percentile_school)) +
geom_point(size = 2) +
labs(title = paste("School percentile VS. Applicant quality in", yr),
x = "School percentile (better schools on the right)",
y = "Typical applicant quality (higher quality on top)") +
geom_smooth())
## `geom_smooth()` using method = 'loess'
ggsave(filename = "school_pctile_vs_applicant_quality_2015.pdf", plot = plot2, width = 8, height = 6)
## `geom_smooth()` using method = 'loess'
# load("/data/nycdoe/clean_data/eightgrade_apps_long_w_dbn.Rdata")
# summary(eightgrade_apps_long_w_dbn)
rm(eightgrade_apps_long_w_dbn_percentile)
## Warning in rm(eightgrade_apps_long_w_dbn_percentile): object
## 'eightgrade_apps_long_w_dbn_percentile' not found
eightgrade_apps_long_w_dbn_percentile <-
eightgrade_apps_long_w_dbn %>%
left_join(school_percentile, by="dbn") %>%
select(year, student_id_scram, choice, program_applied, dbn, "percentile_school" = percentile)
head(eightgrade_apps_long_w_dbn_percentile)
## # A tibble: 6 x 6
## year student_id_scram choice program_applied dbn
## <dbl> <chr> <chr> <chr> <chr>
## 1 2005 489302451 r1programcode1 X31A 07X548
## 2 2005 626402037 r1programcode1 X86R 10X268
## 3 2005 843402069 r1programcode1 X39A 10X549
## 4 2005 967502441 r1programcode1 X36A 09X403
## 5 2005 330502342 r1programcode1 X86R 10X268
## 6 2005 799502792 r1programcode1 M57B 02M542
## # ... with 1 more variables: percentile_school <dbl>
# among ~9.4 million applications, ~4.6 million don't have percentile_school
summary(eightgrade_apps_long_w_dbn_percentile)
## year student_id_scram choice program_applied
## Min. :2005 Length:9421536 Length:9421536 Length:9421536
## 1st Qu.:2007 Class :character Class :character Class :character
## Median :2010 Mode :character Mode :character Mode :character
## Mean :2010
## 3rd Qu.:2013
## Max. :2015
##
## dbn percentile_school
## Length:9421536 Min. : 1
## Class :character 1st Qu.:44
## Mode :character Median :67
## Mean :63
## 3rd Qu.:81
## Max. :99
## NA's :4617901
# # sanity check: anyone not list choice1 but list choice2, etc.? YES! There are a good number of people who do this.
# eightgrade_apps_long_w_dbn %>%
# mutate(foo = is.na(program_applied)) %>%
# #select(year, student_id_scram, choice, program_applied, foo) %>%
# filter(year == 2005) %>%
# arrange(student_id_scram) %>%
# group_by(student_id_scram) %>%
# summarize(bar = length(rle(foo)),
# baz = sum(lag(foo, default=0) != foo)) %>%
# filter(bar > 2 | baz > 1)
#
# # Because there are so many cases like this, we now only care that applicants MUST NOT SKIP TOP1 CHOICE if they ever list any choice at all.
# eightgrade_apps_long_w_dbn_notskip_r1r1 <-
# eightgrade_apps_long_w_dbn %>%
# group_by(year, student_id_scram) %>%
# mutate(foo = is.na(program_applied)) %>%
# #select(year, student_id_scram, choice, program_applied, foo) %>%
# filter(year == 2005) %>%
# arrange(student_id_scram) %>%
# group_by(student_id_scram)
# who_skip_r1r1 <-
# eightgrade_apps_long_w_dbn %>%
# filter(choice == "r1pgrogramcode1" & is.na(program_applied)) %>%
# left_join(eightgrade_apps_long_w_dbn, by=c("year", "student_id_scram"))
#
# eightgrade_apps_long_w_dbn[which(eightgrade_apps_long_w_dbn$choice == "r1programcode1")]
#
# eightgrade_apps_long_w_dbn %>%
# filter(student_id_scram == "102112789")
# compute "application score"
perapp_aspiration <-
eightgrade_apps_long_w_dbn_percentile %>%
group_by(year, student_id_scram) %>%
summarize(schoolpercentile_of_topchoice = percentile_school[1],
median_schoolpercentile = median(percentile_school, na.rm=TRUE))
head(perapp_aspiration)
## # A tibble: 6 x 4
## # Groups: year [1]
## year student_id_scram schoolpercentile_of_topchoice
## <dbl> <chr> <dbl>
## 1 2005 100162233 42.68126
## 2 2005 100202069 72.81122
## 3 2005 100262015 66.64103
## 4 2005 100262063 NA
## 5 2005 100262096 35.29412
## 6 2005 100262143 86.73051
## # ... with 1 more variables: median_schoolpercentile <dbl>
summary(perapp_aspiration)
## year student_id_scram schoolpercentile_of_topchoice
## Min. :2005 Length:785128 Min. : 1.26
## 1st Qu.:2007 Class :character 1st Qu.:50.07
## Median :2010 Mode :character Median :70.45
## Mean :2010 Mean :67.23
## 3rd Qu.:2013 3rd Qu.:84.71
## Max. :2015 Max. :98.87
## NA's :84011
## median_schoolpercentile
## Min. : 1.26
## 1st Qu.:49.59
## Median :67.51
## Mean :63.98
## 3rd Qu.:77.96
## Max. :99.04
## NA's :36597
# missing_topchoice <-
# eightgrade_apps_long_w_dbn_percentile %>%
# filter(year == 2010) %>%
# group_by(student_id_scram) %>%
# summarize(schoolpercentile_of_topchoice = percentile_school[1]) %>%
# filter(is.na(schoolpercentile_of_topchoice))
#
# eightgrade_apps_long_w_dbn_percentile %>%
# filter(year == 2010 & student_id_scram %in% missing_topchoice$student_id_scram & choice == "r1programcode1") %>%
# count(is.na(program_applied))
# #
# 47,414 rows like this, so it seems missing topchoice-percentile does NOT cover missing median-percentile
perapp_aspiration %>%
filter(is.na(schoolpercentile_of_topchoice) & !is.na(median_schoolpercentile))
## # A tibble: 47,414 x 4
## # Groups: year [11]
## year student_id_scram schoolpercentile_of_topchoice
## <dbl> <chr> <dbl>
## 1 2005 100262063 NA
## 2 2005 100362307 NA
## 3 2005 100462043 NA
## 4 2005 100462514 NA
## 5 2005 100462613 NA
## 6 2005 100462905 NA
## 7 2005 100472674 NA
## 8 2005 100652081 NA
## 9 2005 100762013 NA
## 10 2005 100762089 NA
## # ... with 47,404 more rows, and 1 more variables:
## # median_schoolpercentile <dbl>
# for now, we will just drop all NA, no matter which column it is in (~84k rows are dropped)
perapp_aspiration_dropNA <-
perapp_aspiration %>%
filter(!is.na(schoolpercentile_of_topchoice) & !is.na(median_schoolpercentile))
summary(perapp_aspiration_dropNA)
## year student_id_scram schoolpercentile_of_topchoice
## Min. :2005 Length:701117 Min. : 1.258
## 1st Qu.:2008 Class :character 1st Qu.:50.068
## Median :2010 Mode :character Median :70.451
## Mean :2010 Mean :67.226
## 3rd Qu.:2013 3rd Qu.:84.713
## Max. :2015 Max. :98.871
## median_schoolpercentile
## Min. : 1.258
## 1st Qu.:50.376
## Median :67.819
## Mean :64.584
## 3rd Qu.:78.762
## Max. :98.871
# plot it:
yr = 2015
# method 1: "application score" = schoolpercentile_of_topchoice
(plot3 <-
perapp_aspiration_dropNA %>%
filter(year == yr) %>%
ggplot() +
geom_histogram(aes(x = schoolpercentile_of_topchoice), bins = 40) +
labs(title = paste("Distribution of Aspiration in", yr),
subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
x = "Application's aspiration (based on the percentile of their top1 school)",
y = "Number of students at this level") +
scale_y_continuous(labels = scales::comma))
ggsave(filename = "ambitious_bytop1school_2015.pdf", plot = plot3, width = 8, height = 6)
# method 2: "application score" = median_schoolpercentile
(plot4 <-
perapp_aspiration_dropNA %>%
filter(year == yr) %>%
ggplot() +
geom_histogram(aes(x = median_schoolpercentile), bins = 40) +
labs(title = paste("Distribution of Aspiration in", yr),
subtitle = "Method 2: 'application score' = median_schoolpercentile",
x = "Application's aspiration (based on the median percentile of all schools on list)",
y = "Number of students at this level") +
scale_y_continuous(labels = scales::comma))
ggsave(filename = "ambitious_bymedian_2015.pdf", plot = plot4, width = 8, height = 6)
rm(aspiration_vs_perf)
## Warning in rm(aspiration_vs_perf): object 'aspiration_vs_perf' not found
aspiration_vs_perf <-
perapp_aspiration_dropNA %>%
left_join(avg_all_percentile, by=c("student_id_scram", "year"))
summary(aspiration_vs_perf)
## year student_id_scram schoolpercentile_of_topchoice
## Min. :2005 Length:701117 Min. : 1.258
## 1st Qu.:2008 Class :character 1st Qu.:50.068
## Median :2010 Mode :character Median :70.451
## Mean :2010 Mean :67.226
## 3rd Qu.:2013 3rd Qu.:84.713
## Max. :2015 Max. :98.871
##
## median_schoolpercentile grade_level performance
## Min. : 1.258 Min. :8 Min. : 0.00
## 1st Qu.:50.376 1st Qu.:8 1st Qu.:27.16
## Median :67.819 Median :8 Median :46.34
## Mean :64.584 Mean :8 Mean :46.60
## 3rd Qu.:78.762 3rd Qu.:8 3rd Qu.:65.71
## Max. :98.871 Max. :8 Max. :99.92
## NA's :12576 NA's :12576
head(aspiration_vs_perf)
## # A tibble: 6 x 6
## # Groups: year [1]
## year student_id_scram schoolpercentile_of_topchoice
## <dbl> <chr> <dbl>
## 1 2005 100162233 42.68126
## 2 2005 100202069 72.81122
## 3 2005 100262015 66.64103
## 4 2005 100262096 35.29412
## 5 2005 100262143 86.73051
## 6 2005 100262355 77.94118
## # ... with 3 more variables: median_schoolpercentile <dbl>,
## # grade_level <int>, performance <dbl>
tail(aspiration_vs_perf)
## # A tibble: 6 x 6
## # Groups: year [1]
## year student_id_scram schoolpercentile_of_topchoice
## <dbl> <chr> <dbl>
## 1 2015 999902058 69.42544
## 2 2015 999902280 85.67031
## 3 2015 999902628 55.09576
## 4 2015 999902751 78.86457
## 5 2015 999912646 96.71683
## 6 2015 999912769 93.67305
## # ... with 3 more variables: median_schoolpercentile <dbl>,
## # grade_level <int>, performance <dbl>
# there are 12,576 students (1.79%) with NA in performance. We will drop these.
aspiration_vs_perf <-
aspiration_vs_perf %>%
filter(!is.na(performance))
# plot it!
yr = 2015
# method 1: "application score" = schoolpercentile_of_topchoice
aspiration_vs_perf %>%
filter(year == yr) %>%
ggplot() +
geom_point(aes(x = performance, y = schoolpercentile_of_topchoice), alpha = 0.2, color = "dark blue") +
labs(title = paste("Aspiration vs. Performance in", yr),
subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
y = "Application's aspiration (based on the percentile of their top1 school)",
x = "Applicant's 8th-grade performance ranking")
# method 1 : improved version 1
aspiration_vs_perf %>%
group_by(year, binned_studPerf = round(performance)/10) %>%
summarise(mean_schoolpercentile_of_topchoice = mean(schoolpercentile_of_topchoice)) %>%
filter(year == yr) %>%
ggplot() +
geom_point(aes(x = binned_studPerf, y = mean_schoolpercentile_of_topchoice), color = "dark blue") +
labs(title = paste("Aspiration vs. Performance in", yr),
subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
y = "Application's aspiration (based on the percentile of their top1 school)",
x = "Applicant's 8th-grade performance ranking")
# method 1 : improved version 2 (seems a bit more nuanced)
(plot5 <-
aspiration_vs_perf %>%
mutate(binned_studPerf = as.factor(round(performance/10))) %>%
filter(year == yr) %>%
ggplot() +
geom_boxplot(aes(x = binned_studPerf, y = schoolpercentile_of_topchoice), alpha = 0.2, color = "dark blue") +
labs(title = paste("Aspiration vs. Performance in", yr),
subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
y = "Application's aspiration (based on the percentile of their top1 school)",
x = "Applicant's 8th-grade performance ranking"))
ggsave(filename = "ambitious_against_perf_2015_bytop1choice.pdf", plot = plot5, width = 8, height = 6)
# method 2: "application score" = median_schoolpercentile
aspiration_vs_perf %>%
filter(year == yr) %>%
ggplot() +
geom_point(aes(x = performance, y = median_schoolpercentile), alpha = 0.2, color = "dark blue") +
labs(title = paste("Aspiration vs. Performance in", yr),
subtitle = "Method 2: 'application score' = median_schoolpercentile",
y = "Application's aspiration (based on the median percentile of schools on list)",
x = "Applicant's 8th-grade percentile (based on GPA)")
# method 2: improved version 1
aspiration_vs_perf %>%
group_by(year, binned_studPerf = round(performance)/10) %>%
summarise(mean_median_schoolpercentile = mean(median_schoolpercentile)) %>%
filter(year == yr) %>%
ggplot() +
geom_point(aes(x = binned_studPerf, y = mean_median_schoolpercentile), color = "dark blue") +
labs(title = paste("Aspiration vs. Performance in", yr),
subtitle = "Method 2: 'application score' = median_schoolpercentile",
y = "Application's aspiration (based on the median percentile of schools on list)",
x = "Applicant's 8th-grade performance ranking")
# method 2: improved version 2 (seems a bit more nuanced)
(plot6 <-
aspiration_vs_perf %>%
mutate(binned_studPerf = as.factor(round(performance/10))) %>%
filter(year == yr) %>%
ggplot() +
geom_boxplot(aes(x = binned_studPerf, y = median_schoolpercentile), alpha = 0.2, color = "dark blue") +
labs(title = paste("Aspiration vs. Performance in", yr),
subtitle = "Method 2: 'application score' = median_schoolpercentile",
y = "Application's aspiration (based on the median percentile of schools on list)",
x = "Applicant's 8th-grade performance ranking"))
ggsave(filename = "ambitious_against_perf_2015_bymedian.pdf", plot = plot6, width = 8, height = 6)